package nz.ac.massey.spidernetpn.webcrawler;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * Holds the set of file extensions that are blacklisted for the web crawler.
 *
 * <p>Every entry is a lower-case extension including its leading dot (for
 * example {@code ".pdf"}). How the entries are matched against a URL is decided
 * by the code that reads {@link #BLACKLIST}; presumably URLs whose path ends in
 * one of these extensions are skipped -- verify against the caller.
 *
 * <p>The extension list was extracted from the common file types listed at
 * http://www.fileinfo.com/filetypes/
 */
public class BlacklistedWebpages {
	/**
	 * Unmodifiable set of blacklisted file extensions, each with a leading dot.
	 * Attempts to modify it throw {@link UnsupportedOperationException}.
	 */
	protected static final Set<String> BLACKLIST;

	static {
		// Each extension is listed exactly once, under the first category it
		// belongs to; some formats (e.g. ".pdf", ".mp4", ".xml") fit several.
		String[] extensions = {
				// Meta and style information
				// NOTE(review): ".cfm" (ColdFusion) usually serves HTML pages -- confirm it should be skipped.
				".dtd", ".css", ".cfm", ".js", ".xslt", ".xml", ".xsl",

				// Images, vector graphics, CAD and page layout files
				".ico", ".jpeg", ".jpg", ".bmp", ".gif", ".png", ".psd", ".psp",
				".pct", ".thm",
				".tif", ".tiff",
				".tff",  // legacy entry, most likely a misspelling of ".tiff"; kept for compatibility
				".ai",   // Adobe Illustrator File
				".drw",  // Drawing File
				".dxf",  // Drawing Exchange Format File
				".eps",  // Encapsulated PostScript File
				".ps",   // PostScript File
				".svg",  // Scalable Vector Graphics File
				".3dm",  // Rhino 3D Model
				".dwg",  // AutoCAD Drawing Database File
				".pln",  // ArchiCAD Project File
				".indd", // Adobe InDesign File
				".pdf",  // Portable Document Format File
				".qxd",  // QuarkXPress Document
				".qxp",  // QuarkXPress Project File

				// Video and Flash content
				".mpeg", ".3g2", ".3gp", ".asf", ".asx", ".avi", ".flv", ".mkv", ".mov",
				".mp4", ".mpg", ".qt", ".rm", ".swf", ".vob", ".wmv",

				// Sound
				".midi", ".flac", ".ogg", ".m4a", ".aac", ".aif", ".iff", ".m3u",
				".mid", ".mp3", ".mpa", ".ra", ".wav", ".wma",

				// Text Files
				".doc",   // Microsoft Word Document
				".docx",  // Microsoft Word Open XML Document
				".log",   // Log File
				".msg",   // Mail Message
				".pages", // Pages Document
				".rtf",   // Rich Text Format File
				".txt",   // Plain Text File
				".wpd",   // WordPerfect Document
				".wps",   // Microsoft Works Word Processor Document

				// Data Files
				".123",   // Lotus 1-2-3 Spreadsheet
				".accdb", // Access 2007 Database File
				".csv",   // Comma Separated Values File
				".dat",   // Data File
				".db",    // Database File
				".dll",   // Dynamic Link Library
				".mdb",   // Microsoft Access Database
				".pps",   // PowerPoint Slide Show
				".ppt",   // PowerPoint Presentation
				".pptx",  // Microsoft PowerPoint Open XML Document
				".sdb",   // OpenOffice.org Base Database File
				".sql",   // Structured Query Language Data
				".vcf",   // vCard File
				".wks",   // Microsoft Works Spreadsheet
				".xls",   // Microsoft Excel Spreadsheet
				".xlsx",  // Microsoft Excel Open XML Document
				".odt",   // Openoffice document
				".ods",   // Openoffice spreadsheet
				".odp",   // Openoffice presentation

				// Font Files
				".fnt", // Windows Font File
				".fon", // Generic Font File
				".otf", // OpenType Font
				".ttf", // TrueType Font

				// Plugin Files
				".8bi",    // Photoshop Plug-in
				".plugin", // Mac OS X Plug-in
				".xll",    // Excel Add-In File

				// System Files
				".cab", // Windows Cabinet File
				".cpl", // Windows Control Panel
				".cur", // Windows Cursor
				".dmp", // Windows Memory Dump
				".drv", // Device Driver
				".key", // Security Key
				".lnk", // File Shortcut
				".sys", // Windows System File

				// Settings Files
				".cfg", // Configuration File
				".ini", // Windows Initialization File
				".prf", // Outlook Profile File

				// Executable Files
				// NOTE(review): ".com" also matches the end of "http://example.com" if callers
				// compare against the whole URL rather than the path -- verify against caller.
				// NOTE(review): ".cgi" and ".pl" often generate HTML pages -- confirm they should be skipped.
				".app", // Mac OS X Application
				".bat", // DOS Batch File
				".cgi", // Common Gateway Interface Script
				".com", // DOS Command File
				".exe", // Windows Executable File
				".pif", // Program Information File
				".vb",  // VBScript File
				".ws",  // Windows Script

				// Compressed Files
				".7z",   // 7-Zip Compressed File
				".deb",  // Debian Software Package
				".gz",   // Gnu Zipped File
				".pkg",  // Mac OS X Installer Package
				".rar",  // WinRAR Compressed Archive
				".sea",  // Self-Extracting Archive
				".sit",  // Stuffit Archive
				".sitx", // Stuffit X Archive
				".zip",  // Zipped File
				".zipx", // Extended Zip File

				// Encoded Files
				".bin", // Macbinary II Encoded File
				".hqx", // BinHex 4.0 Encoded File
				".mim", // Multi-Purpose Internet Mail Message
				".uue", // Uuencoded File

				// Developer Files
				".c",    // C/C++ Source Code File
				".cpp",  // C++ Source Code File
				".java", // Java Source Code File
				".pl",   // Perl Script

				// Backup Files
				".bak", // Backup File
				".bup", // Backup File
				".gho", // Norton Ghost Backup File
				".ori", // Original File
				".tmp", // Temporary File

				// Disk Files
				".dmg",   // Mac OS X Disk Image
				".iso",   // Disc Image File
				".toast", // Toast Disc Image
				".vcd",   // Virtual CD

				// Game Files
				".gam", // Saved Game File
				".nes", // Nintendo (NES) ROM File
				".rom", // N64 Game ROM File
				".sav", // Saved Game

				// Misc Files
				".msi",     // Windows Installer Package
				".part",    // Partially Downloaded File
				".torrent", // BitTorrent File
				".yps",     // Yahoo! Messenger Data File
		};

		Set<String> blacklist = new HashSet<String>();
		Collections.addAll(blacklist, extensions);

		// Wrap so that callers cannot alter the shared blacklist.
		BLACKLIST = Collections.unmodifiableSet(blacklist);
	}
}
